Validation

source(here::here("src/funs.R"))
source(here::here("validation/cs_process.R"))
## ── Attaching packages ────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.3     ✓ dplyr   1.0.2
## ✓ tidyr   1.1.1     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ── Conflicts ───────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## here() starts at /Users/Lauren/Documents/R projects/OV_Histotypes
## Warning: 867 parsing failures.
##  row                  col   expected    actual                                                                  file
## 1362 review_grade         a double   n/a       '/Users/Lauren/Documents/R projects/OV_Histotypes/data-raw/annot.csv'
## 2973 reviewCompletionDate date like  2014-08-8 '/Users/Lauren/Documents/R projects/OV_Histotypes/data-raw/annot.csv'
## 2974 reviewCompletionDate date like  2014-08-8 '/Users/Lauren/Documents/R projects/OV_Histotypes/data-raw/annot.csv'
## 3541 reviewCompletionDate date like  2014-08-8 '/Users/Lauren/Documents/R projects/OV_Histotypes/data-raw/annot.csv'
## 3542 reviewCompletionDate date like  2014-08-8 '/Users/Lauren/Documents/R projects/OV_Histotypes/data-raw/annot.csv'
## .... .................... .......... ......... .....................................................................
## See problems(...) for more details.
cs1_class <- readRDS(here::here("data/cs1_class.rds"))
cs2_class <- readRDS(here::here("data/cs2_class.rds"))

Concordance

First we validate the CS2 normalization process by looking at the distribution of CS3 non-normalized samples with:

  • CS2 non-normalized
  • CS2 pools-normalized
  • CS2 reference-normalized
# Dataset Split -----------------------------------------------------------

# CS2 gene order
cs2_genes <- names(cs2_clean)[-1:-2]

# Split CS2 common samples into Reference and Validation sets
# Reference set: duplicate samples
cs2_ref <- cs2_clean %>%
  dplyr::filter(duplicated(ottaID) | duplicated(ottaID, fromLast = TRUE)) %>%
  dplyr::select(-FileName)

# Validation set: unique samples
cs2_val <- dplyr::anti_join(cs2_clean, cs2_ref, by = "ottaID") %>%
  dplyr::select(-FileName)


# No Normalization --------------------------------------------------------

# Sort CS2 validation set without normalization to preserve sample order
cs2_val_norm0 <- dplyr::arrange(cs2_val, ottaID)

# Extract CS3 samples corresponding to CS2 validation set without normalization
# Averaged gene expression within duplicates
cs3_val_norm0 <- cs3_clean %>%
  dplyr::select(names(cs2_val_norm0)) %>%
  dplyr::semi_join(cs2_val_norm0, by = "ottaID") %>%
  dplyr::group_by(ottaID) %>%
  dplyr::summarize_if(is.double, mean)


# Pool Method -------------------------------------------------------------

# Pool set: pool samples from CS2
pool_samples <-
  gsub("-|\\.RCC", "", grep("POOL", pools[["CS2-FileName"]], value = TRUE))
cs2_pools <- dplyr::select(cs2_norm, Name, paste0("X", pool_samples))

# Locked down reference pools weights
weights <- c("Pool1", "Pool2", "Pool3") %>%
  purrr::set_names() %>%
  purrr::map_dbl(~ mean(grepl(., names(ref_pools), ignore.case = TRUE)))

# Weighted mean gene expression for CS2 pools (using reference pools weights)
cs2_pools_mgx <- weights %>%
  purrr::imap_dfc(~ .x * rowMeans(dplyr::select(cs2_pools, dplyr::matches(.y)))) %>%
  dplyr::transmute(Name = cs2_pools[["Name"]], cs2_exp = rowSums(.))

# Mean gene expression for CS3 reference pools
cs3_pools_mgx <-
  tibble::enframe(rowMeans(ref_pools), name = "Name", value = "cs3_exp")

# Transposed validation set
cs2_val_t <- cs2_val %>%
  tidyr::gather(Name, value, -1) %>%
  tidyr::spread(ottaID, value)

# Normalize each gene by adding batch effect (diff in mean gx)
cs2_val_norm1 <-
  dplyr::inner_join(cs3_pools_mgx, cs2_pools_mgx, by = "Name") %>%
  dplyr::transmute(Name, be = cs3_exp - cs2_exp) %>%
  dplyr::inner_join(cs2_val_t, by = "Name") %>%
  tidyr::gather(ottaID, exp, -1:-2) %>%
  dplyr::transmute(Name = factor(Name, levels = cs2_genes), ottaID, exp = be + exp) %>%
  tidyr::spread(Name, exp)


# Reference Method --------------------------------------------------------

# Corresponding samples in CS3 found in CS2 reference set
cs3_ref <- cs3_clean %>%
  dplyr::semi_join(cs2_ref, by = "ottaID") %>%
  dplyr::select(cs2_genes)
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(cs2_genes)` instead of `cs2_genes` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
# Remove non-numeric column names
cs2_ref2 <- dplyr::select_if(cs2_ref, is.double)
cs2_val2 <- dplyr::select_if(cs2_val, is.double)

# Normalize by reference method using reference samples
cs2_val_norm2 <- nanostringr::refMethod(cs2_val2, cs2_ref2, cs3_ref) %>%
  tibble::as_tibble() %>%
  tibble::add_column(ottaID = cs2_val[["ottaID"]], .before = 1) %>%
  dplyr::arrange(ottaID)
# Method Comparisons ------------------------------------------------------

# Compare cs2_val_norm0, cs2_val_norm1, cs2_val_norm2, with cs3_val_norm0
cs_combined <-
  tibble::lst(cs2_val_norm0, cs2_val_norm1, cs2_val_norm2, cs3_val_norm0) %>%
  dplyr::bind_rows(.id = "dataset") %>%
  tidyr::gather(gene, exp, -1:-2, factor_key = TRUE) %>%
  tidyr::spread(dataset, exp)

# Common plotting layers
n_samples <- dplyr::n_distinct(cs_combined[["ottaID"]])
layers <- list(
  geom_point(alpha = 0.3),
  geom_abline(slope = 1, intercept = 0, color = "blue"),
  facet_wrap(~ gene, nrow = 8, ncol = 9),
  labs(
    y = "CS3 No Normalization",
    title = paste0("CS2 Validation Set (n=", n_samples,
                   ") vs. Corresponding CS3 Samples")
  )
)
# Scatterplots of each CS2 dataset vs CS3
p0 <- ggplot(cs_combined, aes(cs2_val_norm0, cs3_val_norm0)) +
  xlab("CS2 No Normalization") +
  layers
print(p0)
Gene Expression CS2 No Normalization vs. CS3

Gene Expression CS2 No Normalization vs. CS3

p1 <- ggplot(cs_combined, aes(cs2_val_norm1, cs3_val_norm0)) +
  xlab("CS2 Pools Normalization") +
  layers
print(p1)
Gene Expression CS2 Pools Normalization vs. CS3

Gene Expression CS2 Pools Normalization vs. CS3

p2 <- ggplot(cs_combined, aes(cs2_val_norm2, cs3_val_norm0)) +
  xlab("CS2 Reference Normalization") +
  layers
print(p2)
Gene Expression CS2 Reference Normalization vs. CS3

Gene Expression CS2 Reference Normalization vs. CS3

# Concordance Histograms --------------------------------------------------

# Concordance measures between samples of same gene, for each dataset comparison
all_metrics <- cs_combined %>%
  tidyr::gather(Methods, cs2_val_norm, dplyr::matches("cs2")) %>%
  dplyr::mutate(
    Methods = dplyr::recode_factor(
      Methods,
      cs2_val_norm0 = "CS2 None vs. CS3 None",
      cs2_val_norm1 = "CS2 Pools vs. CS3 None",
      cs2_val_norm2 = "CS2 Common vs. CS3 None"
    )
  ) %>%
  dplyr::group_by(Methods, ottaID) %>%
  dplyr::summarize(
    R2 = cor(cs2_val_norm, cs3_val_norm0) ^ 2,
    Ca = epiR::epi.ccc(cs2_val_norm, cs3_val_norm0)[["C.b"]],
    Rc = epiR::epi.ccc(cs2_val_norm, cs3_val_norm0)[["rho.c"]][["est"]]
  ) %>%
  dplyr::ungroup() %>%
  tidyr::gather(Metric, Expression, c("R2", "Ca", "Rc"), factor_key = TRUE)
## `summarise()` regrouping output by 'Methods' (override with `.groups` argument)
# Plot all combinations of cross-dataset concordance measure histograms
p <- ggplot(all_metrics, aes(Expression)) +
  geom_histogram(bins = 30, fill = "blue") +
  facet_grid(rows = vars(Methods), cols = vars(Metric), scales = "free_x") +
  labs(y = "Count",
       title = "CS2 Datasets vs. CS3 Concordance Measure Distributions") +
  theme_bw() +
  theme(panel.grid.minor = element_blank())
print(p)
Concordance Histograms

Concordance Histograms

)

Full Data Distributions

The histotype distributions on the full data are shown below.

# Histotype Distributions -------------------------------------------------
all_hist <- hist %>% dplyr::filter(!is.na(revHist))
all_hist %>%
  dplyr::count(CodeSet, hist_gr) %>%
  tidyr::spread(CodeSet, n, fill = 0L) %>% 
  knitr::kable(caption = "All CodeSet Histotype Groups")
All CodeSet Histotype Groups
hist_gr CS1 CS2 CS3
HGSC 169 757 2453
non-HGSC 196 377 677
all_counts <- all_hist %>% dplyr::count(CodeSet, revHist)
all_counts %>%
  tidyr::spread(CodeSet, n, fill = 0L) %>% 
  knitr::kable(caption = "All CodeSet Histotypes")
All CodeSet Histotypes
revHist CS1 CS2 CS3
CARCINOMA-NOS 0 61 23
Carcinoma, NOS 0 0 2
CCOC 57 68 182
CCOC-MCT 0 1 0
Cell-Line 17 48 13
CTRL 0 12 0
ENOC 61 30 272
ENOC-CCOC 0 7 0
ERROR 0 3 0
HGSC 169 757 2453
HGSC-MCT 0 1 0
LGSC 22 29 50
MBOT 0 20 3
MET-NOP 0 21 0
MIXED (ENOC/CCOC) 0 0 1
MIXED (ENOC/LGSC) 0 0 1
MIXED (HGSC/CCOC) 0 0 1
mixed cell 0 0 7
MMMT 0 0 30
MUC 20 61 77
Other (use when 6, 7, or 9 is not distinguished) or unknown if epithelial 0 0 1
Other/Exclude 0 0 8
SBOT 19 10 3
Serous 0 0 2
serous LMP 0 0 1
SQAMOUS 0 1 0
UNK 0 4 0
common_counts <- all_hist %>%
  dplyr::filter(FileName %in% common_samples) %>% 
  dplyr::count(CodeSet, revHist)
common_counts %>% 
  tidyr::spread(CodeSet, n, fill = 0L) %>% 
  knitr::kable(caption = "Common Summary ID CodeSet Histotypes")
Common Summary ID CodeSet Histotypes
revHist CS1 CS2 CS3
CCOC 3 4 9
Cell-Line 4 5 5
ENOC 4 4 9
HGSC 68 64 98
LGSC 7 5 8
MUC 7 5 11
all_maj_counts <- all_hist %>% 
  dplyr::filter(revHist %in% c("HGSC", "LGSC", "ENOC", "CCOC", "MUC")) %>% 
  dplyr::count(CodeSet, revHist)
all_maj_counts %>% 
  tidyr::spread(CodeSet, n, fill = 0L) %>% 
  dplyr::mutate(CS1_percent = CS1 /sum(CS1) * 100, CS1_percent = round(CS1_percent,1))  %>% 
  dplyr::mutate(CS2_percent = CS2 /sum(CS2) * 100, CS2_percent = round(CS2_percent,1))  %>% 
  dplyr::mutate(CS3_percent = CS3 /sum(CS3) * 100, CS3_percent = round(CS3_percent,1))  %>% 
  knitr::kable(caption = "All CodeSet Major Histotypes")
All CodeSet Major Histotypes
revHist CS1 CS2 CS3 CS1_percent CS2_percent CS3_percent
CCOC 57 68 182 17.3 7.2 6.0
ENOC 61 30 272 18.5 3.2 9.0
HGSC 169 757 2453 51.4 80.1 80.9
LGSC 22 29 50 6.7 3.1 1.6
MUC 20 61 77 6.1 6.5 2.5
all_counts %>% 
  dplyr::filter(CodeSet == "CS1") %>% 
  knitr::kable(caption = "CS1 Histotypes")
CS1 Histotypes
CodeSet revHist n
CS1 CCOC 57
CS1 Cell-Line 17
CS1 ENOC 61
CS1 HGSC 169
CS1 LGSC 22
CS1 MUC 20
CS1 SBOT 19
all_counts %>% 
  dplyr::filter(CodeSet == "CS2") %>% 
  knitr::kable(caption = "CS2 Histotypes")
CS2 Histotypes
CodeSet revHist n
CS2 CARCINOMA-NOS 61
CS2 CCOC 68
CS2 CCOC-MCT 1
CS2 Cell-Line 48
CS2 CTRL 12
CS2 ENOC 30
CS2 ENOC-CCOC 7
CS2 ERROR 3
CS2 HGSC 757
CS2 HGSC-MCT 1
CS2 LGSC 29
CS2 MBOT 20
CS2 MET-NOP 21
CS2 MUC 61
CS2 SBOT 10
CS2 SQAMOUS 1
CS2 UNK 4
all_counts %>% 
  dplyr::filter(CodeSet == "CS3") %>% 
  knitr::kable(caption = "CS3 Histotypes")
CS3 Histotypes
CodeSet revHist n
CS3 CARCINOMA-NOS 23
CS3 Carcinoma, NOS 2
CS3 CCOC 182
CS3 Cell-Line 13
CS3 ENOC 272
CS3 HGSC 2453
CS3 LGSC 50
CS3 MBOT 3
CS3 MIXED (ENOC/CCOC) 1
CS3 MIXED (ENOC/LGSC) 1
CS3 MIXED (HGSC/CCOC) 1
CS3 mixed cell 7
CS3 MMMT 30
CS3 MUC 77
CS3 Other (use when 6, 7, or 9 is not distinguished) or unknown if epithelial 1
CS3 Other/Exclude 8
CS3 SBOT 3
CS3 Serous 2
CS3 serous LMP 1

Training Set Distributions

The training set distributions for CS1 and CS2 are shown below.

cs1_class %>% 
  tibble::enframe(value = "histotype") %>% 
  dplyr::count(histotype) %>% 
  knitr::kable(caption = "CS1 Training Set Histotypes")
CS1 Training Set Histotypes
histotype n
CCC 57
ENOCa 59
HGSC 156
LGSC 16
MUC 16
cs2_class %>% 
  tibble::enframe(value = "histotype") %>% 
  dplyr::count(histotype) %>% 
  knitr::kable(caption = "CS2 Training Set Histotypes")
CS2 Training Set Histotypes
histotype n
CCOC 68
ENOC 30
HGSC 757
LGSC 29
MUC 61

Normalization

source(here::here("validation/cs_process_cohorts.R"))
## Warning: 867 parsing failures.
##  row                  col   expected    actual                                                                  file
## 1362 review_grade         a double   n/a       '/Users/Lauren/Documents/R projects/OV_Histotypes/data-raw/annot.csv'
## 2973 reviewCompletionDate date like  2014-08-8 '/Users/Lauren/Documents/R projects/OV_Histotypes/data-raw/annot.csv'
## 2974 reviewCompletionDate date like  2014-08-8 '/Users/Lauren/Documents/R projects/OV_Histotypes/data-raw/annot.csv'
## 3541 reviewCompletionDate date like  2014-08-8 '/Users/Lauren/Documents/R projects/OV_Histotypes/data-raw/annot.csv'
## 3542 reviewCompletionDate date like  2014-08-8 '/Users/Lauren/Documents/R projects/OV_Histotypes/data-raw/annot.csv'
## .... .................... .......... ......... .....................................................................
## See problems(...) for more details.
# Pairwise CodeSet comparisons
codesets <- c("CS1", "CS2", "CS3")
all_codesets <- combn(codesets, 2) %>%
  as_tibble() %>%
  set_names(map_chr(., paste, collapse = "_vs_"))
## Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if `.name_repair` is omitted as of tibble 2.0.0.
## Using compatibility `.name_repair`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
cs1_raw_avg <- cs1 %>%
  rename_all(~ gsub("^X", "", .)) %>%
  filter(Name %in% common_genes) %>%
  select_if(names(.) %in% c("Name", common_samples)) %>%
  mutate(Name = fct_inorder(Name)) %>%
  gather(FileName, value, -Name) %>%
  inner_join(hist, by = "FileName") %>%
  spread(Name, value) %>%
  select(FileName, ottaID, all_of(common_genes)) %>% 
  group_by(ottaID) %>% 
  summarize_if(is.double, mean) %>% 
  ungroup() %>% 
  column_to_rownames("ottaID")

cs2_raw_avg <- cs2 %>%
  rename_all(~ gsub("^X", "", .)) %>%
  filter(Name %in% common_genes) %>%
  select_if(names(.) %in% c("Name", common_samples)) %>%
  mutate(Name = fct_inorder(Name)) %>%
  gather(FileName, value, -Name) %>%
  inner_join(hist, by = "FileName") %>%
  spread(Name, value) %>%
  select(FileName, ottaID, all_of(common_genes)) %>% 
  group_by(ottaID) %>% 
  summarize_if(is.double, mean) %>% 
  ungroup() %>% 
  column_to_rownames("ottaID")

cs3_raw_avg <- cs3 %>%
  rename_all(~ gsub("^X", "", .)) %>%
  filter(Name %in% common_genes) %>%
  select_if(names(.) %in% c("Name", common_samples)) %>%
  mutate(Name = fct_inorder(Name)) %>%
  gather(FileName, value, -Name) %>%
  inner_join(hist, by = "FileName") %>%
  spread(Name, value) %>%
  select(FileName, ottaID, all_of(common_genes)) %>% 
  group_by(ottaID) %>% 
  summarize_if(is.double, mean) %>% 
  ungroup() %>% 
  column_to_rownames("ottaID")

raw_list <-
  set_names(list(cs1_raw_avg, cs2_raw_avg, cs3_raw_avg), codesets)
metrics_raw <- all_codesets %>%
  imap_dfr(~ {
    pmap_dfr(raw_list[.x], ~ cor_stats(.x, .y)) %>% mutate(Sites = .y)
  }) %>%
  gather(key = "Metric", value = "Expression", -Sites) %>%
  group_by(Sites, Metric) %>%
  mutate(Median = paste0("Median = ", scales::number(median(Expression), accuracy = 0.01))) %>%
  ungroup()

# Plot all combinations of cross-codeset concordance measure histograms
p_raw <- ggplot(metrics_raw, aes(Expression)) +
  geom_histogram(bins = 30, fill = "blue") +
  geom_text(aes(x = 0, y = 40, label = Median),
            hjust = 0,
            vjust = 1,
            check_overlap = TRUE) +
  facet_grid(rows = vars(Sites), cols = vars(Metric), scales = "free_x") +
  labs(y = "Count",
       title = "Raw Non-Normalized Concordance Measure Distributions") +
  theme_bw() +
  theme(panel.grid.minor = element_blank())
print(p_raw)

cs1_hknorm_avg <- cs1_norm %>%
  rename_all(~ gsub("^X", "", .)) %>%
  filter(Name %in% common_genes) %>%
  select_if(names(.) %in% c("Name", common_samples)) %>%
  mutate(Name = fct_inorder(Name)) %>%
  gather(FileName, value, -Name) %>%
  inner_join(hist, by = "FileName") %>%
  spread(Name, value) %>%
  select(FileName, ottaID, all_of(common_genes)) %>% 
  group_by(ottaID) %>% 
  summarize_if(is.double, mean) %>% 
  ungroup() %>% 
  column_to_rownames("ottaID")

cs2_hknorm_avg <- cs2_norm %>%
  rename_all(~ gsub("^X", "", .)) %>%
  filter(Name %in% common_genes) %>%
  select_if(names(.) %in% c("Name", common_samples)) %>%
  mutate(Name = fct_inorder(Name)) %>%
  gather(FileName, value, -Name) %>%
  inner_join(hist, by = "FileName") %>%
  spread(Name, value) %>%
  select(FileName, ottaID, all_of(common_genes)) %>% 
  group_by(ottaID) %>% 
  summarize_if(is.double, mean) %>% 
  ungroup() %>% 
  column_to_rownames("ottaID")

cs3_hknorm_avg <- cs3_norm %>%
  rename_all(~ gsub("^X", "", .)) %>%
  filter(Name %in% common_genes) %>%
  select_if(names(.) %in% c("Name", common_samples)) %>%
  mutate(Name = fct_inorder(Name)) %>%
  gather(FileName, value, -Name) %>%
  inner_join(hist, by = "FileName") %>%
  spread(Name, value) %>%
  select(FileName, ottaID, all_of(common_genes)) %>% 
  group_by(ottaID) %>% 
  summarize_if(is.double, mean) %>% 
  ungroup() %>% 
  column_to_rownames("ottaID")

hknorm_list <-
  set_names(list(cs1_hknorm_avg, cs2_hknorm_avg, cs3_hknorm_avg), codesets)
metrics_hknorm <- all_codesets %>%
  imap_dfr(~ {
    pmap_dfr(hknorm_list[.x], ~ cor_stats(.x, .y)) %>% mutate(Sites = .y)
  }) %>%
  gather(key = "Metric", value = "Expression", -Sites) %>%
  group_by(Sites, Metric) %>%
  mutate(Median = paste0("Median = ", scales::number(median(Expression), accuracy = 0.01))) %>%
  ungroup()

# Plot all combinations of cross-codeset concordance measure histograms
p_hknorm <- ggplot(metrics_hknorm, aes(Expression)) +
  geom_histogram(bins = 30, fill = "blue") +
  geom_text(aes(x = 0, y = 40, label = Median),
            hjust = 0,
            vjust = 1,
            check_overlap = TRUE) +
  facet_grid(rows = vars(Sites), cols = vars(Metric), scales = "free_x") +
  labs(y = "Count",
       title = "HK genes Normalized Concordance Measure Distributions") +
  theme_bw() +
  theme(panel.grid.minor = element_blank())
print(p_hknorm)

Common Samples Method

We employ a new normalization technique using randomly selected samples common to all three CodeSets with a uniform distribution of histotypes as the reference dataset. The number of randomly selected samples ranges from 1-3 per histotype. Hence, the reference dataset has either 5, 10, or 15 samples and we validate on the remaining. Note that ottaID duplicates are collapsed by mean averaging the gene expression. n=72 common samples.

CodeSets 1 and 2 are calibrated to CodeSet3 as follows:
X^1(norm) = X^1 - R^1 + R^3
X^2(norm) = X^2 - R^2 + R^3
X^3(norm) = X^3

# Random selection of common samples with equal number of histotypes
set.seed(2020)
hist_rand3 <- hist %>%
  filter(ottaID %in% unique(c(
    cs1_clean$ottaID, cs2_clean$ottaID, cs3_clean$ottaID
  ))) %>%
  distinct(ottaID, revHist) %>%
  group_by(revHist) %>%
  slice_sample(n = 3) %>%
  ungroup()

# Gene expression from random common samples, preserving gene order
cs1_rand3 <- join_avg(cs1_clean, hist_rand3, "ottaID", "keep")
cs2_rand3 <- join_avg(cs2_clean, hist_rand3, "ottaID", "keep")
cs3_rand3 <- join_avg(cs3_clean, hist_rand3, "ottaID", "keep")

# Remove common samples from CS1, preserving gene order
cs1_counts3 <- join_avg(cs1_clean, hist_rand3, "ottaID", "discard")
cs2_counts3 <- join_avg(cs2_clean, hist_rand3, "ottaID", "discard")
cs3_counts3 <- join_avg(cs3_clean, hist_rand3, "ottaID", "discard")

# Normalize by reference method using common samples, add histotypes from annot
cs1_norm_rand3 <-
  refMethod(cs1_counts3, cs3_rand3, cs1_rand3) %>%
  as.data.frame() %>%
  rownames_to_column("ottaID") %>%
  inner_join(hist %>% distinct(ottaID, revHist), by = "ottaID") %>%
  column_to_rownames("ottaID")

cs2_norm_rand3 <-
  refMethod(cs2_counts3, cs3_rand3, cs2_rand3) %>%
  as.data.frame() %>%
  rownames_to_column("ottaID") %>%
  inner_join(hist %>% distinct(ottaID, revHist), by = "ottaID") %>%
  column_to_rownames("ottaID")

# Combined gene expression
counts3 <-
  set_names(list(cs1_counts3, cs2_counts3, cs3_counts3), codesets)
norm_rand3 <- list(cs1_norm_rand3, cs2_norm_rand3, cs3_counts3) %>%
  set_names(codesets) %>%
  map_at(1:2, select, -"revHist")

# Concordance measures for all genes averaged across samples
metrics_non3 <- all_codesets %>%
  imap_dfr(~ {
    pmap_dfr(counts3[.x], ~ cor_stats(.x, .y)) %>% mutate(Sites = .y)
  }) %>%
  gather(key = "Metric", value = "Expression", -Sites) %>%
  group_by(Sites, Metric) %>%
  mutate(Median = paste0("Median = ", scales::number(median(Expression), accuracy = 0.01))) %>%
  ungroup()

metrics_rand3 <- all_codesets %>%
  imap_dfr(~ {
    pmap_dfr(norm_rand3[.x], ~ cor_stats(.x, .y)) %>% mutate(Sites = .y)
  }) %>%
  gather(key = "Metric", value = "Expression", -Sites) %>%
  group_by(Sites, Metric) %>%
  mutate(Median = paste0("Median = ", scales::number(median(Expression), accuracy = 0.01))) %>%
  ungroup()
# Plot all combinations of cross-codeset concordance measure histograms
p_non3 <- ggplot(metrics_non3, aes(Expression)) +
  geom_histogram(bins = 30, fill = "blue") +
  geom_text(aes(x = 0, y = 40, label = Median),
            hjust = 0,
            vjust = 1,
            check_overlap = TRUE) +
  facet_grid(rows = vars(Sites), cols = vars(Metric), scales = "free_x") +
  labs(y = "Count",
       title = "Random3 Non-Normalized Concordance Measure Distributions") +
  theme_bw() +
  theme(panel.grid.minor = element_blank())
print(p_non3)
Random3 Non-Normalized Concordance Measure Distributions

Random3 Non-Normalized Concordance Measure Distributions

p_rand3 <- ggplot(metrics_rand3, aes(Expression)) +
  geom_histogram(bins = 30, fill = "blue") +
  geom_text(aes(x = 0, y = 40, label = Median),
            hjust = 0,
            vjust = 1,
            check_overlap = TRUE) +
  facet_grid(rows = vars(Sites), cols = vars(Metric), scales = "free_x") +
  labs(y = "Count",
       title = "Random3 Normalized Concordance Measure Distributions") +
  theme_bw() +
  theme(panel.grid.minor = element_blank())
print(p_rand3)
Random3 Normalized Concordance Measure Distributions

Random3 Normalized Concordance Measure Distributions

# Random selection of common samples with equal number of histotypes
set.seed(2020)
hist_rand2 <- hist %>%
  filter(ottaID %in% unique(c(
    cs1_clean$ottaID, cs2_clean$ottaID, cs3_clean$ottaID
  ))) %>%
  distinct(ottaID, revHist) %>%
  group_by(revHist) %>%
  slice_sample(n = 2) %>%
  ungroup()

# Gene expression from random common samples, preserving gene order
cs1_rand2 <- join_avg(cs1_clean, hist_rand2, "ottaID", "keep")
cs2_rand2 <- join_avg(cs2_clean, hist_rand2, "ottaID", "keep")
cs3_rand2 <- join_avg(cs3_clean, hist_rand2, "ottaID", "keep")

# Remove common samples from CS1, preserving gene order
cs1_counts2 <- join_avg(cs1_clean, hist_rand2, "ottaID", "discard")
cs2_counts2 <- join_avg(cs2_clean, hist_rand2, "ottaID", "discard")
cs3_counts2 <- join_avg(cs3_clean, hist_rand2, "ottaID", "discard")

# Normalize by reference method using common samples, add histotypes from annot
cs1_norm_rand2 <-
  refMethod(cs1_counts2, cs3_rand2, cs1_rand2) %>%
  as.data.frame() %>%
  rownames_to_column("ottaID") %>%
  inner_join(hist %>% distinct(ottaID, revHist), by = "ottaID") %>%
  column_to_rownames("ottaID")

cs2_norm_rand2 <-
  refMethod(cs2_counts2, cs3_rand2, cs2_rand2) %>%
  as.data.frame() %>%
  rownames_to_column("ottaID") %>%
  inner_join(hist %>% distinct(ottaID, revHist), by = "ottaID") %>%
  column_to_rownames("ottaID")

# Combined gene expression
counts2 <-
  set_names(list(cs1_counts2, cs2_counts2, cs3_counts2), codesets)
norm_rand2 <- list(cs1_norm_rand2, cs2_norm_rand2, cs3_counts2) %>%
  set_names(codesets) %>%
  map_at(1:2, select, -"revHist")

# Concordance measures for all genes averaged across samples
metrics_non2 <- all_codesets %>%
  imap_dfr(~ {
    pmap_dfr(counts2[.x], ~ cor_stats(.x, .y)) %>% mutate(Sites = .y)
  }) %>%
  gather(key = "Metric", value = "Expression", -Sites) %>%
  group_by(Sites, Metric) %>%
  mutate(Median = paste0("Median = ", scales::number(median(Expression), accuracy = 0.01))) %>%
  ungroup()

metrics_rand2 <- all_codesets %>%
  imap_dfr(~ {
    pmap_dfr(norm_rand2[.x], ~ cor_stats(.x, .y)) %>% mutate(Sites = .y)
  }) %>%
  gather(key = "Metric", value = "Expression", -Sites) %>%
  group_by(Sites, Metric) %>%
  mutate(Median = paste0("Median = ", scales::number(median(Expression), accuracy = 0.01))) %>%
  ungroup()
# Plot all combinations of cross-codeset concordance measure histograms
p_non2 <- ggplot(metrics_non2, aes(Expression)) +
  geom_histogram(bins = 30, fill = "blue") +
  geom_text(aes(x = 0, y = 40, label = Median),
            hjust = 0,
            vjust = 1,
            check_overlap = TRUE) +
  facet_grid(rows = vars(Sites), cols = vars(Metric), scales = "free_x") +
  labs(y = "Count",
       title = "Random2 Non-Normalized Concordance Measure Distributions") +
  theme_bw() +
  theme(panel.grid.minor = element_blank())
print(p_non2)
Random2 Non-Normalized Concordance Measure Distributions

Random2 Non-Normalized Concordance Measure Distributions

p_rand2 <- ggplot(metrics_rand2, aes(Expression)) +
  geom_histogram(bins = 30, fill = "blue") +
  geom_text(aes(x = 0, y = 50, label = Median),
            hjust = 0,
            vjust = 1,
            check_overlap = TRUE) +
  facet_grid(rows = vars(Sites), cols = vars(Metric), scales = "free_x") +
  labs(y = "Count",
       title = "Random2 Normalized Concordance Measure Distributions") +
  theme_bw() +
  theme(panel.grid.minor = element_blank())
print(p_rand2)
Random2 Normalized Concordance Measure Distributions

Random2 Normalized Concordance Measure Distributions

# Random selection of common samples with equal number of histotypes
set.seed(2020)
hist_rand1 <- hist %>%
  filter(ottaID %in% unique(c(
    cs1_clean$ottaID, cs2_clean$ottaID, cs3_clean$ottaID
  ))) %>%
  distinct(ottaID, revHist) %>%
  group_by(revHist) %>%
  slice_sample(n = 1) %>%
  ungroup()

# Gene expression from random common samples, preserving gene order
cs1_rand1 <- join_avg(cs1_clean, hist_rand1, "ottaID", "keep")
cs2_rand1 <- join_avg(cs2_clean, hist_rand1, "ottaID", "keep")
cs3_rand1 <- join_avg(cs3_clean, hist_rand1, "ottaID", "keep")

# Remove common samples from CS1, preserving gene order
cs1_counts1 <- join_avg(cs1_clean, hist_rand1, "ottaID", "discard")
cs2_counts1 <- join_avg(cs2_clean, hist_rand1, "ottaID", "discard")
cs3_counts1 <- join_avg(cs3_clean, hist_rand1, "ottaID", "discard")

# Normalize by reference method using common samples, add histotypes from annot
cs1_norm_rand1 <-
  refMethod(cs1_counts1, cs3_rand1, cs1_rand1) %>%
  as.data.frame() %>%
  rownames_to_column("ottaID") %>%
  inner_join(hist %>% distinct(ottaID, revHist), by = "ottaID") %>%
  column_to_rownames("ottaID")

cs2_norm_rand1 <-
  refMethod(cs2_counts1, cs3_rand1, cs2_rand1) %>%
  as.data.frame() %>%
  rownames_to_column("ottaID") %>%
  inner_join(hist %>% distinct(ottaID, revHist), by = "ottaID") %>%
  column_to_rownames("ottaID")

# Combined gene expression
counts1 <-
  set_names(list(cs1_counts1, cs2_counts1, cs3_counts1), codesets)
norm_rand1 <- list(cs1_norm_rand1, cs2_norm_rand1, cs3_counts1) %>%
  set_names(codesets) %>%
  map_at(1:2, select, -"revHist")

# Concordance measures for all genes averaged across samples
metrics_non1 <- all_codesets %>%
  imap_dfr(~ {
    pmap_dfr(counts1[.x], ~ cor_stats(.x, .y)) %>% mutate(Sites = .y)
  }) %>%
  gather(key = "Metric", value = "Expression", -Sites) %>%
  group_by(Sites, Metric) %>%
  mutate(Median = paste0("Median = ", scales::number(median(Expression), accuracy = 0.01))) %>%
  ungroup()

metrics_rand1 <- all_codesets %>%
  imap_dfr(~ {
    pmap_dfr(norm_rand1[.x], ~ cor_stats(.x, .y)) %>% mutate(Sites = .y)
  }) %>%
  gather(key = "Metric", value = "Expression", -Sites) %>%
  group_by(Sites, Metric) %>%
  mutate(Median = paste0("Median = ", scales::number(median(Expression), accuracy = 0.01))) %>%
  ungroup()
# Plot all combinations of cross-codeset concordance measure histograms
p_non1 <- ggplot(metrics_non1, aes(Expression)) +
  geom_histogram(bins = 30, fill = "blue") +
  geom_text(aes(x = 0, y = 40, label = Median),
            hjust = 0,
            vjust = 1,
            check_overlap = TRUE) +
  facet_grid(rows = vars(Sites), cols = vars(Metric), scales = "free_x") +
  labs(y = "Count",
       title = "Random1 Non-Normalized Concordance Measure Distributions") +
  theme_bw() +
  theme(panel.grid.minor = element_blank())
print(p_non1)
Random1 Non-Normalized Concordance Measure Distributions

Random1 Non-Normalized Concordance Measure Distributions

p_rand1 <- ggplot(metrics_rand1, aes(Expression)) +
  geom_histogram(bins = 30, fill = "blue") +
  geom_text(aes(x = 0, y = 40, label = Median),
            hjust = 0,
            vjust = 1,
            check_overlap = TRUE) +
  facet_grid(rows = vars(Sites), cols = vars(Metric), scales = "free_x") +
  labs(y = "Count",
       title = "Random1 Normalized Concordance Measure Distributions") +
  theme_bw() +
  theme(panel.grid.minor = element_blank())
print(p_rand1)
Random1 Normalized Concordance Measure Distributions

Random1 Normalized Concordance Measure Distributions

cs1_norm_rand1_byhist <- cs1_norm_rand1 %>%
  split(.$revHist) %>%
  map(select, -"revHist")
cs2_norm_rand1_byhist <- cs2_norm_rand1 %>%
  split(.$revHist) %>%
  map(select, -"revHist")

cs1_counts1_byhist <- cs1_counts1 %>%
  rownames_to_column("ottaID") %>%
  inner_join(hist %>% distinct(ottaID, revHist), by = "ottaID") %>%
  column_to_rownames("ottaID") %>% 
  split(.$revHist) %>%
  map(select, -"revHist")
cs2_counts1_byhist <- cs2_counts1 %>%
  rownames_to_column("ottaID") %>%
  inner_join(hist %>% distinct(ottaID, revHist), by = "ottaID") %>%
  column_to_rownames("ottaID") %>% 
  split(.$revHist) %>%
  map(select, -"revHist")
cs3_counts1_byhist <- cs3_counts1 %>%
  rownames_to_column("ottaID") %>%
  inner_join(hist %>% distinct(ottaID, revHist), by = "ottaID") %>%
  column_to_rownames("ottaID") %>% 
  split(.$revHist) %>%
  map(select, -"revHist")
rand1_cs1_vs_cs3_non <- list(cs1_counts1_byhist, cs3_counts1_byhist) %>%
  transpose() %>%
  map_dfr(~ pmap_df(.x, cor_stats, .id = "gene"), .id = "hist") %>%
  group_by(hist) %>%
  summarize_if(is.double, median) %>%
  ungroup() %>% 
  rename_if(is.double, ~ paste0(., "-Non"))
rand1_cs1_vs_cs3_norm <- list(cs1_norm_rand1_byhist, cs3_counts1_byhist) %>%
  transpose() %>%
  map_dfr(~ pmap_df(.x, cor_stats, .id = "gene"), .id = "hist") %>%
  group_by(hist) %>%
  summarize_if(is.double, median) %>%
  ungroup() %>% 
  rename_if(is.double, ~ paste0(., "-Norm"))
rand1_cs1_vs_cs3 <-
  inner_join(rand1_cs1_vs_cs3_non, rand1_cs1_vs_cs3_norm, by = "hist")

knitr::kable(rand1_cs1_vs_cs3,
             caption = "Random1 CS1 vs. CS3 Median Concordance Measures by Histotypes",
             digits = 2)
Random1 CS1 vs. CS3 Median Concordance Measures by Histotypes
hist R2-Non Ca-Non Rc-Non R2-Norm Ca-Norm Rc-Norm
CCOC 1.00 0.30 0.08 1.00 0.28 0.09
ENOC 1.00 0.54 0.54 1.00 0.61 0.61
HGSC 0.78 0.98 0.85 0.78 0.97 0.86
LGSC 0.97 0.87 0.81 0.97 0.90 0.87
MUC 0.75 0.85 0.68 0.75 0.82 0.64
rand1_cs2_vs_cs3_non <- list(cs2_counts1_byhist, cs3_counts1_byhist) %>%
  transpose() %>%
  map_dfr(~ pmap_df(.x, cor_stats, .id = "gene"), .id = "hist") %>%
  group_by(hist) %>%
  summarize_if(is.double, median) %>%
  ungroup() %>% 
  rename_if(is.double, ~ paste0(., "-Non"))
rand1_cs2_vs_cs3_norm <- list(cs2_norm_rand1_byhist, cs3_counts1_byhist) %>%
  transpose() %>%
  map_dfr(~ pmap_df(.x, cor_stats, .id = "gene"), .id = "hist") %>%
  group_by(hist) %>%
  summarize_if(is.double, median) %>%
  ungroup() %>% 
  rename_if(is.double, ~ paste0(., "-Norm"))
rand1_cs2_vs_cs3 <-
  inner_join(rand1_cs2_vs_cs3_non, rand1_cs2_vs_cs3_norm, by = "hist")

knitr::kable(rand1_cs2_vs_cs3,
             caption = "Random1 CS2 vs. CS3 Median Concordance Measures by Histotypes",
             digits = 2)
Random1 CS2 vs. CS3 Median Concordance Measures by Histotypes
hist R2-Non Ca-Non Rc-Non R2-Norm Ca-Norm Rc-Norm
CCOC 1.00 0.20 0.04 1.00 0.27 0.09
ENOC 1.00 0.67 0.64 1.00 0.64 0.61
HGSC 0.83 0.96 0.86 0.83 0.99 0.89
LGSC 0.99 0.92 0.91 0.99 0.96 0.94
MUC 0.70 0.78 0.55 0.70 0.86 0.57
# Random selection of common samples with equal number of histotypes
set.seed(2020)
hist_rand <- hist %>%
  filter(ottaID %in% unique(c(
    cs1_clean$ottaID, cs2_clean$ottaID, cs3_clean$ottaID
  ))) %>%
  distinct(ottaID, revHist) %>%
  filter(revHist == "HGSC") %>%
  slice_sample(n = 3)

# Gene expression from random common samples, preserving gene order
cs1_rand <- join_avg(cs1_clean, hist_rand, "ottaID", "keep")
cs2_rand <- join_avg(cs2_clean, hist_rand, "ottaID", "keep")
cs3_rand <- join_avg(cs3_clean, hist_rand, "ottaID", "keep")

# Remove common samples from CS1, preserving gene order
cs1_counts <- join_avg(cs1_clean, hist_rand, "ottaID", "discard")
cs2_counts <- join_avg(cs2_clean, hist_rand, "ottaID", "discard")
cs3_counts <- join_avg(cs3_clean, hist_rand, "ottaID", "discard")

# Normalize by reference method using common samples, add histotypes from annot
cs1_norm_rand <-
  refMethod(cs1_counts, cs3_rand, cs1_rand) %>%
  as.data.frame() %>%
  rownames_to_column("ottaID") %>%
  inner_join(hist %>% distinct(ottaID, revHist), by = "ottaID") %>%
  column_to_rownames("ottaID")

cs2_norm_rand <-
  refMethod(cs2_counts, cs3_rand, cs2_rand) %>%
  as.data.frame() %>%
  rownames_to_column("ottaID") %>%
  inner_join(hist %>% distinct(ottaID, revHist), by = "ottaID") %>%
  column_to_rownames("ottaID")

norm_rand <- list(cs1_norm_rand, cs2_norm_rand, cs3_counts) %>%
  set_names(codesets) %>%
  map_at(1:2, select, -"revHist")

metrics_rand <- all_codesets %>%
  imap_dfr(~ {
    pmap_dfr(norm_rand[.x], ~ cor_stats(.x, .y)) %>%
      mutate(Sites = .y)
  }) %>%
  gather(key = "Metric", value = "Expression", -Sites) %>%
  group_by(Sites, Metric) %>%
  mutate(Median = paste0("Median = ", scales::number(median(Expression), accuracy = 0.01))) %>%
  ungroup()
p_rand <- ggplot(metrics_rand, aes(Expression)) +
  geom_histogram(bins = 30, fill = "blue") +
  geom_text(aes(x = 0, y = 40, label = Median),
            hjust = 0,
            vjust = 1,
            check_overlap = TRUE) +
  facet_grid(rows = vars(Sites), cols = vars(Metric), scales = "free_x") +
  labs(y = "Count",
       title = "Random3 HGSC Normalized Concordance Measure Distributions") +
  theme_bw() +
  theme(panel.grid.minor = element_blank())
print(p_rand)
Random3 HGSC Normalized Concordance Measure Distributions

Random3 HGSC Normalized Concordance Measure Distributions

cs1_norm_byhist <- cs1_norm_rand %>%
  split(.$revHist) %>%
  map(select, -"revHist")
cs2_norm_byhist <- cs2_norm_rand %>%
  split(.$revHist) %>%
  map(select, -"revHist")

cs1_byhist <- cs1_counts %>%
  rownames_to_column("ottaID") %>%
  inner_join(hist %>% distinct(ottaID, revHist), by = "ottaID") %>%
  column_to_rownames("ottaID") %>% 
  split(.$revHist) %>%
  map(select, -"revHist")
cs2_byhist <- cs2_counts %>%
  rownames_to_column("ottaID") %>%
  inner_join(hist %>% distinct(ottaID, revHist), by = "ottaID") %>%
  column_to_rownames("ottaID") %>% 
  split(.$revHist) %>%
  map(select, -"revHist")
cs3_byhist <- cs3_counts %>%
  rownames_to_column("ottaID") %>%
  inner_join(hist %>% distinct(ottaID, revHist), by = "ottaID") %>%
  column_to_rownames("ottaID") %>% 
  split(.$revHist) %>%
  map(select, -"revHist")
rand3_hgsc_cs1_vs_cs3_non <- list(cs1_byhist, cs3_byhist) %>%
  transpose() %>%
  map_dfr(~ pmap_df(.x, ~ cor_stats(.x, .y), .id = "gene"),
          .id = "hist") %>%
  group_by(hist) %>%
  summarize_if(is.double, median) %>%
  ungroup() %>% 
  rename_if(is.double, ~ paste0(., "-Non"))
rand3_hgsc_cs1_vs_cs3_norm <- list(cs1_norm_byhist, cs3_byhist) %>%
  transpose() %>%
  map_dfr(~ pmap_df(.x, ~ cor_stats(.x, .y), .id = "gene"),
          .id = "hist") %>%
  group_by(hist) %>%
  summarize_if(is.double, median) %>%
  ungroup() %>% 
  rename_if(is.double, ~ paste0(., "-Norm"))
rand3_hgsc_cs1_vs_cs3 <- 
  inner_join(rand3_hgsc_cs1_vs_cs3_non, rand3_hgsc_cs1_vs_cs3_norm, by = "hist")

knitr::kable(rand3_hgsc_cs1_vs_cs3, caption = "Random3 HGSC CS1 vs. CS3 Median Concordance Measures by Histotypes", digits = 2)
Random3 HGSC CS1 vs. CS3 Median Concordance Measures by Histotypes
hist R2-Non Ca-Non Rc-Non R2-Norm Ca-Norm Rc-Norm
CCOC 0.67 0.58 0.29 0.67 0.63 0.26
ENOC 0.89 0.80 0.71 0.89 0.80 0.74
HGSC 0.77 0.98 0.85 0.77 0.99 0.87
LGSC 0.94 0.85 0.79 0.94 0.88 0.83
MUC 0.76 0.93 0.73 0.76 0.93 0.79
rand3_hgsc_cs2_vs_cs3_non <- list(cs2_byhist, cs3_byhist) %>%
  transpose() %>%
  map_dfr(~ pmap_df(.x, ~ cor_stats(.x, .y), .id = "gene"),
          .id = "hist") %>%
  group_by(hist) %>%
  summarize_if(is.double, median) %>%
  ungroup() %>% 
  rename_if(is.double, ~ paste0(., "-Non"))
rand3_hgsc_cs2_vs_cs3_norm <- list(cs2_norm_byhist, cs3_byhist) %>%
  transpose() %>%
  map_dfr(~ pmap_df(.x, ~ cor_stats(.x, .y), .id = "gene"),
          .id = "hist") %>%
  group_by(hist) %>%
  summarize_if(is.double, median) %>%
  ungroup() %>% 
  rename_if(is.double, ~ paste0(., "-Norm"))
rand3_hgsc_cs2_vs_cs3 <- 
  inner_join(rand3_hgsc_cs2_vs_cs3_non, rand3_hgsc_cs2_vs_cs3_norm, by = "hist")

knitr::kable(rand3_hgsc_cs2_vs_cs3, caption = "Random3 HGSC CS2 vs. CS3 Median Concordance Measures by Histotypes", digits = 2)
Random3 HGSC CS2 vs. CS3 Median Concordance Measures by Histotypes
hist R2-Non Ca-Non Rc-Non R2-Norm Ca-Norm Rc-Norm
CCOC 0.69 0.54 0.35 0.69 0.64 0.39
ENOC 0.85 0.77 0.66 0.85 0.86 0.76
HGSC 0.82 0.96 0.86 0.82 0.99 0.90
LGSC 0.98 0.95 0.92 0.98 0.94 0.92
MUC 0.77 0.89 0.73 0.77 0.92 0.70

Pools Method

CodeSet2 contains 12 ref pool samples (Pool 1 = 4, Pool 2 = 4, Pool 3 = 4). CodeSet3 contains 22 ref pool samples (Pool 1 = 12, Pool 2 = 5, Pool 3 = 5). n=86 common samples.

CodeSet2 is calibrated to CodeSet3 as follows:
X^2(norm) = X^2 - R^2 + R^3
X^3(norm) = X^3

# Pool set: pool samples from CS2
pool_samples <- cohorts %>% 
  filter(file_source == "cs2", cohort == "POOL-CTRL") %>% 
  pull(col_name)
cs2_pools <- select(cs2_norm, Name, all_of(pool_samples))

# Locked down reference pools weights
weights <- c("Pool1", "Pool2", "Pool3") %>%
  set_names() %>%
  map_dbl(~ mean(grepl(., names(ref_pools), ignore.case = TRUE)))

# Weighted mean gene expression for CS2 pools (using reference pools weights)
cs2_pools_mgx <- weights %>%
  imap_dfc(~ .x * rowMeans(select(cs2_pools, matches(.y)))) %>%
  transmute(Name = cs2_pools[["Name"]], cs2_exp = rowSums(.))

# Mean gene expression for CS3 reference pools
cs3_pools_mgx <-
  enframe(rowMeans(ref_pools), name = "Name", value = "cs3_exp")

# Extract cs2 norm counts (not pools)
cs2_norm_counts <- cs2_norm %>% select(-one_of(names(cs2_pools)[-1]))

# Normalize each gene by adding batch effect (diff in mean gx)
cs2_normalized_data_pools <-
  inner_join(cs2_pools_mgx, cs3_pools_mgx, by = "Name") %>%
  transmute(Name, be = cs3_exp - cs2_exp) %>%
  inner_join(cs2_norm_counts, by = "Name") %>%
  gather(FileName, exp, -1:-4) %>%
  transmute(Name = fct_inorder(Name), FileName, exp = be + exp) %>%
  spread(Name, exp)

## Summary
# CS2 pools: 9 samples, 365 genes # dim(t(cs2_pools))
# CS3 pools: 22 samples, 513 genes # dim(t(ref_pools))
# CS2 normalized: 891 samples (903 original - 12 from pools), 136 common genes # dim(cs2_normalized_data_pools)


# Keep only common samples between codesets, average out duplicates
tmp2 <- cs2_norm_counts %>%
  gather(FileName, exp, -1:-3) %>%
  mutate(FileName = gsub("^X", "", FileName)) %>%
  inner_join(hist, by = "FileName") %>% 
  filter(Name %in% names(cs2_normalized_data_pools)[-1]) %>%
  mutate(Name = factor(Name, levels = names(cs2_normalized_data_pools)[-1])) %>%
  select(Name, ottaID, revHist, exp) %>% 
  group_by(Name, ottaID, revHist) %>%
  summarize(exp = mean(exp)) %>%
  ungroup() %>%
  spread(Name, exp)
## `summarise()` regrouping output by 'Name', 'ottaID' (override with `.groups` argument)
tmp3 <- cs3_norm %>%
  select(-one_of(names(ref_pools))) %>%
  gather(FileName, exp, -1:-3) %>%
  mutate(FileName = gsub("^X", "", FileName)) %>%
  inner_join(hist, by = "FileName") %>% 
  filter(Name %in% names(cs2_normalized_data_pools)[-1]) %>%
  mutate(Name = factor(Name, levels = names(cs2_normalized_data_pools)[-1])) %>%
  select(Name, ottaID, revHist, exp) %>% 
  group_by(Name, ottaID, revHist) %>%
  summarize(exp = mean(exp)) %>%
  ungroup() %>%
  spread(Name, exp)
## `summarise()` regrouping output by 'Name', 'ottaID' (override with `.groups` argument)
cs2_common_non <- semi_join(tmp2, tmp3, by = "ottaID")
cs3_common_non <- semi_join(tmp3, tmp2, by = "ottaID")

# Extract cs2 common pools-normalized
cs2_common_pools <- cs2_normalized_data_pools %>%
  mutate(FileName = gsub("^X", "", FileName)) %>%
  inner_join(hist, by = "FileName") %>% 
  group_by(ottaID, revHist) %>%
  summarize_if(is.double, mean) %>%
  ungroup() %>% 
  semi_join(cs3_common_non, by = "ottaID")

# Combined gene expression
sets <- c("CS2Non", "CS2Pools", "CS3Non")
all_comps <- combn(sets, 2) %>%
  as_tibble() %>%
  set_names(map_chr(., paste, collapse = "_vs_"))
common_gx <-
  set_names(list(cs2_common_non, cs2_common_pools, cs3_common_non), sets) %>%
  map(column_to_rownames, "ottaID") %>% 
  map(select, -"revHist")

# Concordance measures for all genes averaged across samples
metrics_pools <- all_comps %>%
  imap_dfr(~ {
    pmap_dfr(common_gx[.x], ~ cor_stats(.x, .y)) %>% mutate(Sites = .y)
  }) %>% 
  gather(key = "Metric", value = "Expression", -Sites) %>%
  group_by(Sites, Metric) %>%
  mutate(Median = paste0("Median = ", scales::number(median(Expression), accuracy = 0.01))) %>%
  ungroup() %>% 
  mutate(Sites = factor(Sites, levels = names(all_comps)))
p_cs2non_vs_cs2pools <-
  ggplot(metrics_pools %>% filter(Sites == "CS2Non_vs_CS2Pools"),
         aes(Expression)) +
  geom_histogram(bins = 30, fill = "blue") +
  geom_text(aes(x = 0, y = 120, label = Median),
            hjust = 0,
            check_overlap = TRUE) +
  facet_grid(cols = vars(Metric), scales = "free_x") +
  labs(y = "Count",
       title = "CS2Non vs. CS2Pools Concordance Measure Distributions") +
  theme_bw() +
  theme(panel.grid.minor = element_blank())
print(p_cs2non_vs_cs2pools)
CS2Non vs. CS2Pools Concordance Measure Distributions

CS2Non vs. CS2Pools Concordance Measure Distributions

p_cs2non_vs_cs3 <-
  ggplot(metrics_pools %>% filter(Sites == "CS2Non_vs_CS3Non"),
         aes(Expression)) +
  geom_histogram(bins = 30, fill = "blue") +
  geom_text(aes(x = 0, y = 40, label = Median),
            hjust = 0,
            check_overlap = TRUE) +
  facet_grid(cols = vars(Metric), scales = "free_x") +
  labs(y = "Count",
       title = "CS2 Non-Normalized Pools vs. CS3 Concordance Measure Distributions") +
  theme_bw() +
  theme(panel.grid.minor = element_blank())
print(p_cs2non_vs_cs3)
CS2 Non-Normalized Pools vs. CS3 Concordance Measure Distributions

CS2 Non-Normalized Pools vs. CS3 Concordance Measure Distributions

p_cs2pools_vs_cs3 <-
  ggplot(metrics_pools %>% filter(Sites == "CS2Pools_vs_CS3Non"),
         aes(Expression)) +
  geom_histogram(bins = 30, fill = "blue") +
  geom_text(aes(x = 0, y = 40, label = Median),
            hjust = 0,
            check_overlap = TRUE) +
  facet_grid(cols = vars(Metric), scales = "free_x") +
  labs(y = "Count",
       title = "CS2 Normalized Pools vs. CS3 Concordance Measure Distributions") +
  theme_bw() +
  theme(panel.grid.minor = element_blank())
print(p_cs2pools_vs_cs3)
CS2 Normalized Pools vs. CS3 Concordance Measure Distributions

CS2 Normalized Pools vs. CS3 Concordance Measure Distributions

cs2_common_non_byhist <- cs2_common_non %>%
  split(.$revHist) %>%
  map(column_to_rownames, "ottaID") %>% 
  map(select, -"revHist")
cs2_common_pools_byhist <- cs2_common_pools %>%
  split(.$revHist) %>%
  map(column_to_rownames, "ottaID") %>% 
  map(select, -"revHist")
cs3_common_non_byhist <- cs3_common_non %>%
  split(.$revHist) %>%
  map(column_to_rownames, "ottaID") %>% 
  map(select, -"revHist")
pools_cs2non_vs_cs3 <- list(cs2_common_non_byhist,
                            cs3_common_non_byhist) %>%
  transpose() %>%
  map_dfr(~ pmap_df(.x, cor_stats, .id = "gene"), .id = "hist") %>%
  group_by(hist) %>%
  summarize_if(is.double, median) %>%
  ungroup()

knitr::kable(pools_cs2non_vs_cs3,
             caption = "Pools Non-Normalized CS2 vs. CS3 Median Concordance Measures by Histotypes",
             digits = 2)
Pools Non-Normalized CS2 vs. CS3 Median Concordance Measures by Histotypes
hist R2 Ca Rc
CCOC 0.66 0.51 0.29
ENOC 0.87 0.74 0.64
HGSC 0.77 0.94 0.80
LGSC 0.98 0.94 0.92
MUC 0.75 0.86 0.69
pools_cs2norm_vs_cs3 <- list(cs2_common_pools_byhist,
                             cs3_common_non_byhist) %>%
  transpose() %>%
  map_dfr(~ pmap_df(.x, cor_stats, .id = "gene"), .id = "hist") %>%
  group_by(hist) %>%
  summarize_if(is.double, median) %>%
  ungroup()

knitr::kable(pools_cs2norm_vs_cs3,
             caption = "Pools Normalized CS2 vs. CS3 Median Concordance Measures by Histotypes",
             digits = 2)
Pools Normalized CS2 vs. CS3 Median Concordance Measures by Histotypes
hist R2 Ca Rc
CCOC 0.66 0.62 0.31
ENOC 0.87 0.74 0.67
HGSC 0.77 0.94 0.82
LGSC 0.98 0.96 0.93
MUC 0.75 0.91 0.70

Common Sample Distributions

common_dist_all <- hist %>% 
  filter(ottaID %in% c(
    cs1_clean$ottaID, cs2_clean$ottaID, cs3_clean$ottaID
  )) %>% 
  count(CodeSet, revHist) %>% 
  pivot_wider(names_from = "CodeSet", values_from = "n")
knitr::kable(common_dist_all, caption = "All Common Samples Histotype Distribution")
All Common Samples Histotype Distribution
revHist CS1 CS2 CS3
CCOC 3 4 9
ENOC 4 4 9
HGSC 59 62 95
LGSC 7 5 8
MUC 7 5 11
common_dist_distinct <- hist %>% 
  filter(ottaID %in% c(
    cs1_clean$ottaID, cs2_clean$ottaID, cs3_clean$ottaID
  )) %>% 
  distinct(ottaID, CodeSet, revHist) %>% 
  count(CodeSet, revHist) %>% 
  pivot_wider(names_from = "CodeSet", values_from = "n")
knitr::kable(common_dist_distinct, caption = "Distinct Common Samples Histotype Distribution")
Distinct Common Samples Histotype Distribution
revHist CS1 CS2 CS3
CCOC 3 3 3
ENOC 3 3 3
HGSC 57 57 57
LGSC 4 4 4
MUC 5 5 5